{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ur8xi4C7S06n"
      },
      "outputs": [],
      "source": [
        "# Copyright 2025 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JAPoU8Sm5E6e"
      },
      "source": [
        "# Running Qwen 3 with Ollama in Cloud Run for Agents\n",
        "\n",
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg\" alt=\"Google Colaboratory logo\"><br> Open in Colab\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fopen-models%2Fserving%2Fcloud_run_ollama_qwen3_inference.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN\" alt=\"Google Cloud Colab Enterprise logo\"><br> Open in Colab Enterprise\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\">\n",
        "      <img src=\"https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg\" alt=\"Vertex AI logo\"><br> Open in Vertex AI Workbench\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg\" alt=\"GitHub logo\"><br> View on GitHub\n",
        "    </a>\n",
        "  </td>\n",
        "</table>\n",
        "\n",
        "<div style=\"clear: both;\"></div>\n",
        "\n",
        "<b>Share to:</b>\n",
        "\n",
        "<a href=\"https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg\" alt=\"LinkedIn logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg\" alt=\"Bluesky logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg\" alt=\"X logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png\" alt=\"Reddit logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/open-models/serving/cloud_run_ollama_qwen3_inference.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg\" alt=\"Facebook logo\">\n",
        "</a>            "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "84f0f73a0f76"
      },
      "source": [
        "| Author |\n",
        "| --- |\n",
        "| [Vlad Kolesnikov](https://github.com/vladkol) |"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ccd500ae19b5"
      },
      "source": [
        "## Overview"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b1455cd3766f"
      },
      "source": [
        "<style>\n",
        "td, th {\n",
        "   border: none!important;\n",
        "}\n",
        "</style>\n",
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <img src=\"https://camo.githubusercontent.com/8793b3b4014d538b367ec8819dcca85e79cb8d910c808fa7849e3cd85e2ebe79/68747470733a2f2f7169616e77656e2d7265732e6f73732d616363656c65726174652d6f766572736561732e616c6979756e63732e636f6d2f6c6f676f5f7177656e332e706e67\" width=\"100px\"/>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <img src=\"https://ollama.com/public/ollama.png\" height=\"50px\"/>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <img src=\"https://google.github.io/adk-docs/assets/agent-development-kit.png\" height=\"50px\">\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <img src=\"https://www.gstatic.com/bricks/image/f2e0986a2802c0b6c4be7f1355599d5aadfb21a63b7e9643d96697ff9334a1e1.svg\" height=\"50px\">\n",
        "  </td>\n",
        "</table>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tvgnzT1CKxrO"
      },
      "source": [
        "> [**Qwen 3**](https://qwenlm.github.io/blog/qwen3/) is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models.\n",
        "It supports thinking and function calling.\n",
        "\n",
        "> **[Cloud Run](https://cloud.google.com/run)**:\n",
        "It's a serverless platform by Google Cloud for running containerized applications. It automatically scales and manages infrastructure, supporting various programming languages. Cloud Run now offers GPU acceleration for AI/ML workloads. With 30 seconds to the first token, Cloud Run is a perfect platform for serving lightweight models.\n",
        "\n",
        "> **Note:** [GPU support in Cloud Run](https://cloud.google.com/run/docs/configuring/services/gpu) is Generally Available.\n",
        "To use the GPU feature, your project must have `Total Nvidia L4 GPU allocation without zonal redundancy, per project per region` quota.\n",
        "\n",
        "> **[Ollama](https://ollama.com)** is an open-source tool for easily running and deploying large language models locally. It offers simple management and usage of LLMs on personal computers or servers.\n",
        "\n",
        "This notebook showcases how to deploy [Qwen 3](https://qwenlm.github.io/blog/qwen3/) in Cloud Run,\n",
        "with the objective of providing an API for running AI Agents built with Google [Agent Development Kit](https://google.github.io/adk-docs/).\n",
        "\n",
        "By the end of this notebook, you will learn how to:\n",
        "\n",
        "1. Deploy Qwen 3 as an OpenAI-compatible API on Cloud Run using Ollama.\n",
        "2. Build a custom container with Ollama to deploy any Large Language Model (LLM) of your choice.\n",
        "3. Make requests to an API hosted on Cloud Run.\n",
        "4. Create and run an Agent that uses Qwen 3.\n",
        "\n",
        "We will build an agent using [Agent Development Kit](https://google.github.io/adk-docs/) - a flexible and modular model-agnostic framework for developing and deploying AI agents."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "61RBz8LLbxCR"
      },
      "source": [
        "## Get started"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aOiPjM5DEPhK"
      },
      "source": [
        "### Install Google Cloud CLI\n",
        "\n",
        "Make sure your Google Cloud CLI is installed (try running `gcloud version`) or [install it](https://cloud.google.com/sdk/docs/install) before executing this notebook.\n",
        "\n",
        "> If you are running in Colab or Vertex AI Workbench, you already have Google Cloud CLI installed."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b0b84cb331d5"
      },
      "source": [
        "### Install Agent Development Kit and other required packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "330f4fbf7da9"
      },
      "outputs": [],
      "source": [
        "%pip install --upgrade --quiet google-genai google-adk litellm"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HfAVa08RDDJB"
      },
      "source": [
        "### Choose a model, a project, and a region to host the model\n",
        "\n",
        "[Choose a Qwen 3 model](https://ollama.com/library/qwen3) to use, a Google Cloud project to host your Cloud Run service, and a region to host it in.\n",
        "Ollama offers multiple sizes with different quantization.\n",
        "In this notebook, we use [Qwen3:8b](https://ollama.com/library/qwen3:8b) with `Q4_K_M` quantization.\n",
        "\n",
        "For Google Cloud project, if you don't have a project yet:\n",
        "\n",
        "1. [Create a project](https://console.cloud.google.com/projectcreate) in the Google Cloud Console.\n",
        "2. Copy your `Project ID` from the project's [Settings page](https://console.cloud.google.com/iam-admin/settings).\n",
        "\n",
        "The project must have `Total Nvidia L4 GPU allocation without zonal redundancy, per project per region` quota allocated in the selected region.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "TV0pbqJHDDJB"
      },
      "outputs": [],
      "source": [
        "# { display-mode: \"form\", run: \"auto\" }\n",
        "\n",
        "MODEL = \"qwen3:8b\"  # @param {type:\"string\", isTemplate: true}\n",
        "\n",
        "PROJECT_ID = \"[your-project-id]\"  # @param {type:\"string\", isTemplate: true}\n",
        "REGION = \"us-central1\"  # @param {type:\"string\", isTemplate: true}\n",
        "\n",
        "# Stop the run until the placeholder project id has been replaced.\n",
        "if PROJECT_ID == \"[your-project-id]\" or not PROJECT_ID:\n",
        "    print(\"Please specify your project id in PROJECT_ID variable.\")\n",
        "    raise KeyboardInterrupt\n",
        "\n",
        "# Derive the Cloud Run service name from the model tag.\n",
        "# Cloud Run service names accept only lowercase letters, digits and hyphens,\n",
        "# so the tag is lowercased and every other character (for example `.`, `:`,\n",
        "# `/` or `_`) is replaced with a hyphen.\n",
        "MODEL_NAME_ESCAPED = \"\".join(\n",
        "    ch if ch.isascii() and ch.isalnum() else \"-\" for ch in MODEL.lower()\n",
        ")\n",
        "SERVICE_NAME = f\"ollama--{MODEL_NAME_ESCAPED}\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6c36c31ee2de"
      },
      "outputs": [],
      "source": [
        "### Python dependency imports\n",
        "import os\n",
        "import subprocess\n",
        "from datetime import datetime"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dmWOrTJ3gx13"
      },
      "source": [
        "### Authenticate your Google Cloud account\n",
        "\n",
        "Depending on your Jupyter environment, you may have to manually authenticate. Run the cell below."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Xc8Jm1P3Y7fs"
      },
      "outputs": [],
      "source": [
        "# Log in only when no identity token can be obtained with the current credentials.\n",
        "# Plain POSIX redirection is used because `!` commands may run under /bin/sh,\n",
        "# where `&>` backgrounds the command instead of silencing it.\n",
        "!gcloud auth print-identity-token -q > /dev/null 2>&1 || gcloud auth login --project=\"{PROJECT_ID}\" --update-adc --quiet"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "l728UOEPDDJB"
      },
      "source": [
        "## Prepare serving container\n",
        "\n",
        "First, let's create a Dockerfile for a container with the model embedded into it."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f56f9803255a"
      },
      "outputs": [],
      "source": [
        "!rm -f Dockerfile\n",
        "!echo \"ARG _MODEL=\\\"{MODEL}\\\"\" > Dockerfile"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "glBn9gPKDDJB"
      },
      "outputs": [],
      "source": [
        "%%writefile -a Dockerfile\n",
        "\n",
        "FROM ollama/ollama:latest\n",
        "ARG _MODEL\n",
        "\n",
        "# Persist the model name in the image environment: build arguments are only\n",
        "# available while building, but the startup command below needs the name\n",
        "# when the container runs.\n",
        "ENV MODEL=$_MODEL\n",
        "\n",
        "# Set the host and port to listen on\n",
        "ENV OLLAMA_HOST=0.0.0.0:8080\n",
        "\n",
        "# Set the directory to store model weight files\n",
        "ENV OLLAMA_MODELS=/models\n",
        "\n",
        "# Reduce the verbosity of the logs\n",
        "ENV OLLAMA_DEBUG=false\n",
        "\n",
        "# Do not unload model weights from the GPU\n",
        "ENV OLLAMA_KEEP_ALIVE=-1\n",
        "\n",
        "# Start the ollama server and download the model weights\n",
        "RUN ollama serve & sleep 5 && ollama pull $_MODEL\n",
        "\n",
        "# At startup time we start the server and run a dummy request\n",
        "# to request the model to be loaded in the GPU memory\n",
        "ENTRYPOINT [\"/bin/sh\"]\n",
        "CMD [\"-c\", \"ollama serve  & (ollama run $MODEL 'Say one word' &) && wait\"]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vbDiABJcDDJC"
      },
      "source": [
        "## Build Container Image and Deploy Cloud Run Service\n",
        "\n",
        "We are ready to build our container image and deploy Cloud Run service.\n",
        "\n",
        "The script below performs the following actions:\n",
        "\n",
        "* Enables necessary APIs.\n",
        "* Creates an Artifact Repository for the image.\n",
        "* Creates a Service Account for the service.\n",
        "* Submits a Cloud Build job to create and push the container image.\n",
        "* Deploys the Cloud Run service.\n",
        "\n",
        "> The script may take 10-45 minutes to finish.\n",
        "\n",
        "Note the following important flags in the Cloud Run deployment command:\n",
        "\n",
        "* `--concurrency 4` is set to match the value of the environment variable `OLLAMA_NUM_PARALLEL`.\n",
        "* `--gpu 1` with `--gpu-type nvidia-l4` assigns 1 NVIDIA L4 GPU to every Cloud Run instance in the service.\n",
        "* `--no-gpu-zonal-redundancy` allows using the default GPU quota.\n",
        "* `--no-allow-unauthenticated` blocks unauthenticated access to the service.\n",
        "By keeping the service private, you can rely on Cloud Run's built-in [Identity and Access Management (IAM)](https://cloud.google.com/iam) authentication for service-to-service communication.\n",
        "* `--no-cpu-throttling` is required for enabling GPU.\n",
        "* `--service-account` sets the service identity of the service.\n",
        "* `--max-instances` sets maximum number of instances of the service.\n",
        "It has to be equal to or lower than your project's NVIDIA L4 GPU quota.\n",
        "\n",
        "For optimal GPU utilization, increase `--concurrency`, keeping it within twice the value of `OLLAMA_NUM_PARALLEL`.\n",
        "While this leads to request queuing in Ollama, it can help improve utilization:\n",
        "Ollama instances can immediately process requests from their queue, and the queues help absorb traffic spikes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "TXg7IYU1DDJC"
      },
      "outputs": [],
      "source": [
        "%%writefile deploy.sh\n",
        "\n",
        "# Usage: deploy.sh PROJECT_ID REGION MODEL_ID SERVICE_NAME\n",
        "# Enables the required APIs, makes sure the service account exists and deploys\n",
        "# the current directory as a GPU-enabled Cloud Run service.\n",
        "PROJECT_ID=\"${1}\"\n",
        "REGION=\"${2}\"\n",
        "MODEL_ID=\"${3}\" # Accepted for the caller's convenience; not referenced below.\n",
        "SERVICE_NAME=\"${4}\"\n",
        "SERVICE_ACCOUNT=\"ollama-cloud-run-sa\"\n",
        "SERVICE_ACCOUNT_ADDRESS=\"${SERVICE_ACCOUNT}@${PROJECT_ID}.iam.gserviceaccount.com\"\n",
        "MAX_INSTANCES=1 # Adjust this value to match your Cloud Run L4 GPU quota\n",
        "\n",
        "echo \"Enabling APIs in project ${PROJECT_ID}.\"\n",
        "gcloud services enable run.googleapis.com \\\n",
        "    cloudbuild.googleapis.com \\\n",
        "    artifactregistry.googleapis.com \\\n",
        "    --project \"${PROJECT_ID}\" \\\n",
        "    --quiet\n",
        "\n",
        "# From here on, stop at the first failing command.\n",
        "set -e\n",
        "\n",
        "# Create the service account if it doesn't exist yet.\n",
        "sa_list=$(gcloud iam service-accounts list --quiet --format 'value(email)' --project \"${PROJECT_ID}\" --filter=\"email:${SERVICE_ACCOUNT_ADDRESS}\" 2>/dev/null)\n",
        "if [ -z \"${sa_list}\" ]; then\n",
        "    echo \"Creating Service Account ${SERVICE_ACCOUNT}.\"\n",
        "    gcloud iam service-accounts create \"${SERVICE_ACCOUNT}\" \\\n",
        "        --project \"${PROJECT_ID}\" \\\n",
        "        --display-name=\"${SERVICE_ACCOUNT} - Cloud Run Service Account\"\n",
        "fi\n",
        "\n",
        "echo \"Deploying Service ${SERVICE_NAME}. It will take a few minutes...\"\n",
        "gcloud beta run deploy \"${SERVICE_NAME}\" \\\n",
        "    --source . \\\n",
        "    --project=\"${PROJECT_ID}\" \\\n",
        "    --service-account \"${SERVICE_ACCOUNT_ADDRESS}\" \\\n",
        "    --cpu=8 \\\n",
        "    --memory=32Gi \\\n",
        "    --gpu=1 \\\n",
        "    --gpu-type=nvidia-l4 \\\n",
        "    --concurrency 4 \\\n",
        "    --set-env-vars OLLAMA_NUM_PARALLEL=4 \\\n",
        "    --region \"${REGION}\" \\\n",
        "    --no-allow-unauthenticated \\\n",
        "    --max-instances \"${MAX_INSTANCES}\" \\\n",
        "    --no-cpu-throttling \\\n",
        "    --timeout 1h \\\n",
        "    --no-gpu-zonal-redundancy \\\n",
        "    --quiet \\\n",
        "    --no-user-output-enabled\n",
        "\n",
        "rm -f ./Dockerfile # Cleanup\n",
        "\n",
        "SERVICE_URL=$(gcloud run services describe \"${SERVICE_NAME}\" --project=\"${PROJECT_ID}\" --region \"${REGION}\" --format 'value(status.url)' --quiet)\n",
        "echo \"✅ Success!\"\n",
        "echo \"🚀 Service URL: ${SERVICE_URL}\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "e6L2dVGyOAxB"
      },
      "outputs": [],
      "source": [
        "!/bin/bash ./deploy.sh \"{PROJECT_ID}\" \"{REGION}\" \"{MODEL}\" \"{SERVICE_NAME}\" && rm -f ./deploy.sh"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dgaJ62rmDDJC"
      },
      "source": [
        "### Test the deployed service\n",
        "\n",
        "Now, let's test the service you deployed.\n",
        "\n",
        "First, simply by using `cURL`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "iX7LmWwGDDJC"
      },
      "outputs": [],
      "source": [
        "%%bash -s \"$MODEL\" \"$SERVICE_NAME\" \"$PROJECT_ID\" \"$REGION\"\n",
        "\n",
        "# Positional arguments: 1 = model, 2 = service name, 3 = project id, 4 = region.\n",
        "PROMPT=\"Hello!\"\n",
        "SERVICE_URL=$(gcloud run services describe \"${2}\" --project \"${3}\" --region \"${4}\" --format 'value(status.url)' --quiet)\n",
        "AUTH_TOKEN=$(gcloud auth print-identity-token -q)\n",
        "\n",
        "# The payload is one double-quoted string so that a model name or prompt\n",
        "# containing spaces is not split into several curl arguments.\n",
        "curl -s -X POST \"${SERVICE_URL}/api/generate\" \\\n",
        "-H \"Authorization: Bearer ${AUTH_TOKEN}\" \\\n",
        "-H \"Content-Type: application/json\" \\\n",
        "-d \"{ \\\"model\\\": \\\"${1}\\\", \\\"prompt\\\": \\\"${PROMPT}\\\", \\\"max_tokens\\\": 100, \\\"stream\\\": false}\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2b60e54c901d"
      },
      "source": [
        "## Create an AI Agent with Qwen 3"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1e1f3cb538ed"
      },
      "source": [
        "#### Retrieve an Identity Token\n",
        "\n",
        "Cloud Run with authentication by\n",
        "[Google Cloud IAM](https://cloud.google.com/iam/docs/) requires an [identity token](https://cloud.google.com/docs/authentication/get-id-token) in every request's authentication header."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "X-2TbV6tDDJC"
      },
      "outputs": [],
      "source": [
        "def _shell_output(command: str) -> str:\n",
        "    \"\"\"Runs `command` through the shell and returns its stdout, decoded and stripped.\"\"\"\n",
        "    return subprocess.check_output(command, shell=True).decode().strip()\n",
        "\n",
        "\n",
        "# Identity token of the active gcloud account.\n",
        "auth_id_token = _shell_output(\"gcloud auth print-identity-token -q\")\n",
        "\n",
        "# URL of the deployed Cloud Run service.\n",
        "service_url = _shell_output(\n",
        "    f\"gcloud run services describe {SERVICE_NAME} --project={PROJECT_ID} \"\n",
        "    f\"--region={REGION} --format='value(status.url)' -q\"\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2f5ae9755e13"
      },
      "source": [
        "#### Create and run an Agent\n",
        "\n",
        "We create a simple agent that can answer questions about the current time.\n",
        "\n",
        "First, we make a tool that returns the user's current time with the time zone offset."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d66169bd7a12"
      },
      "outputs": [],
      "source": [
        "def get_current_time() -> str:\n",
        "    \"\"\"Returns user's local time and timezone offset.\n",
        "\n",
        "    Returns:\n",
        "        str: Time in ISO format with user's timezone offset.\n",
        "    \"\"\"\n",
        "    # NOTE(review): the docstring is kept verbatim; it is presumably what the\n",
        "    # agent framework shows to the model as the tool description - confirm.\n",
        "    # `astimezone()` without arguments attaches the system's local time zone.\n",
        "    local_now = datetime.now().astimezone()\n",
        "    return local_now.isoformat()\n",
        "\n",
        "\n",
        "print(get_current_time())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b1a1ae76f3bb"
      },
      "source": [
        "Now, we create an agent that can use our tool to answer the user's question."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d7c2b8951050"
      },
      "outputs": [],
      "source": [
        "from google.adk import Runner\n",
        "from google.adk.agents import Agent\n",
        "from google.adk.artifacts import InMemoryArtifactService\n",
        "from google.adk.models.lite_llm import LiteLlm\n",
        "from google.adk.sessions import InMemorySessionService, Session\n",
        "from google.genai import types\n",
        "\n",
        "os.environ[\"OLLAMA_API_BASE\"] = service_url  # still required for LiteLlm to work\n",
        "\n",
        "# The model is addressed through the service's `/v1` endpoint, and the identity\n",
        "# token retrieved earlier is passed as the API key.\n",
        "root_agent = Agent(\n",
        "    name=\"time_agent\",\n",
        "    model=LiteLlm(\n",
        "        model=f\"openai/{MODEL}\",\n",
        "        api_base=service_url + \"/v1\",\n",
        "        api_key=auth_id_token,\n",
        "        temperature=0.1,  # for stable function calling\n",
        "    ),\n",
        "    description=(\"Agent to answer questions about current time.\"),\n",
        "    # Agent instructions with `/no_think` for Qwen 3 to run faster.\n",
        "    instruction=\"\"\"\n",
        "        You are a helpful agent who can answer user questions\n",
        "        about the current local time.\n",
        "        Always output date and time in human-readable form.\n",
        "        /no_think\n",
        "    \"\"\",\n",
        "    tools=[get_current_time],\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0763595c0eff"
      },
      "source": [
        "Initialize the runtime."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2d8631d9da3f"
      },
      "outputs": [],
      "source": [
        "# Identifiers shared by the runner and the session created below.\n",
        "app_name = \"my_app\"\n",
        "user_id_1 = \"user1\"\n",
        "\n",
        "# Sessions and artifacts are kept by in-memory services.\n",
        "artifact_service = InMemoryArtifactService()\n",
        "session_service = InMemorySessionService()\n",
        "\n",
        "runner = Runner(\n",
        "    agent=root_agent,\n",
        "    app_name=app_name,\n",
        "    session_service=session_service,\n",
        "    artifact_service=artifact_service,\n",
        ")\n",
        "\n",
        "# Top-level `await` works in Jupyter; the session is reused by the next cell.\n",
        "new_session = await session_service.create_session(user_id=user_id_1, app_name=app_name)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "64b9387644c6"
      },
      "source": [
        "And run our agent!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f09d7a5e0ca8"
      },
      "outputs": [],
      "source": [
        "async def run_prompt(session: Session, new_message: str):\n",
        "    \"\"\"Sends one user message to the agent and prints the events it yields.\n",
        "\n",
        "    Args:\n",
        "        session: Session whose id is passed to the runner.\n",
        "        new_message: Text of the user message.\n",
        "    \"\"\"\n",
        "    user_content = types.Content(\n",
        "        role=\"user\", parts=[types.Part.from_text(text=new_message)]\n",
        "    )\n",
        "    print(\"** User says:\", new_message)\n",
        "    events = runner.run_async(\n",
        "        user_id=user_id_1,\n",
        "        session_id=session.id,\n",
        "        new_message=user_content,\n",
        "    )\n",
        "    async for event in events:\n",
        "        # Events without content parts have nothing to print.\n",
        "        parts = event.content.parts if event.content else None\n",
        "        if not parts:\n",
        "            continue\n",
        "        print(f\"** {event.author}:\")\n",
        "        for part in parts:\n",
        "            call = part.function_call\n",
        "            result = part.function_response\n",
        "            if call and call.name:\n",
        "                print(f\"\\t#### Calling `{call.name}` with args: {call.args}\")\n",
        "            elif result and result.response:\n",
        "                print(f\"\\t### Function call result: {result.response}\")\n",
        "            elif part.text and part.text.strip():\n",
        "                print(f\"\\t{part.text.strip()}\")\n",
        "\n",
        "\n",
        "QUESTION = \"What time is it now?\"\n",
        "await run_prompt(new_session, QUESTION)  # if not Jupyter, wrap in asyncio.run"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FFL2V_ClDDJD"
      },
      "source": [
        "## Conclusion\n",
        "Congratulations! You can now use Qwen 3 for running your AI Agents built with Agent Development Kit in Cloud Run!"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "f6f17f9aff65"
      },
      "source": [
        "## Cleaning up"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "s1blF2ziDDJD"
      },
      "source": [
        "To delete the Cloud Run service you created, you can uncomment and run the following cell."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VbhAz7-9DDJD"
      },
      "outputs": [],
      "source": [
        "# !gcloud run services delete $SERVICE_NAME --project $PROJECT_ID --region $REGION --quiet"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "cloud_run_ollama_qwen3_inference.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
